home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Freelog 125
/
Freelog_MarsAvril2015_No125.iso
/
Musique
/
Quod Libet
/
quodlibet-3.3.0-installer.exe
/
bin
/
quodlibet
/
parse
/
_diacritic.pyc
(
.txt
)
< prev
next >
Wrap
Python Compiled Bytecode
|
2014-12-31
|
9KB
|
311 lines
# Source Generated with Decompyle++
# File: in.pyc (Python 2.7)
'''
Ways to let ASCII characters match other Unicode characters that
can be decomposed into one ASCII character and one or more combining
diacritic marks. This makes it possible to match e.g. "M\xc3\xbam" using "Mum".
re_add_diacritic_variants(u"Mum") =>
u"[M\xe1\xb8\xbe\xe1\xb9\x80\xe1\xb9\x82][u\xc3\xb9\xc3\xba\xc3\xbb\xc3\xbc\xc5\xa9\xc5\xab\xc5\xad\xc5\xaf\xc5\xb1\xc5\xb3\xc6\xb0\xc7\x94\xc7\x96\xc7\x98\xc7\x9a\xc7\x9c\xc8\x95\xc8\x97\xe1\xb9\xb3\xe1\xb9\xb5\xe1\xb9\xb7\xe1\xb9\xb9\xe1\xb9\xbb\xe1\xbb\xa5\xe1\xbb\xa7\xe1\xbb\xa9\xe1\xbb\xab\xe1\xbb\xad\xe1\xbb\xaf\xe1\xbb\xb1][m\xe1\xb8\xbf\xe1\xb9\x81\xe1\xb9\x83]"
'''
import sre_parse
import unicodedata
import sys
from quodlibet.util import re_escape
_DIACRITIC_CACHE = {
u'\xcc\x80': u'AEINOUWYaeinouwy\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa5\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89\xd0\x95\xd0\x98\xd0\xb5\xd0\xb8',
u'\xcc\x80\xcd\x85': u'\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x81': u'ACEGIKLMNOPRSUWYZacegiklmnoprsuwyz\xc3\x86\xc3\x98\xc3\xa6\xc3\xb8\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa5\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89\xd0\x93\xd0\x9a\xd0\xb3\xd0\xba',
u'\xcc\x81\xcc\x87': u'Ss',
u'\xcc\x81\xcd\x85': u'\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x82': u'ACEGHIJOSUWYZaceghijosuwyz',
u'\xcc\x82\xcc\x80': u'AEOaeo',
u'\xcc\x82\xcc\x81': u'AEOaeo',
u'\xcc\x82\xcc\x83': u'AEOaeo',
u'\xcc\x82\xcc\x89': u'AEOaeo',
u'\xcc\x83': u'AEINOUVYaeinouvy',
u'\xcc\x83\xcc\x81': u'OUou',
u'\xcc\x83\xcc\x84': u'Oo',
u'\xcc\x83\xcc\x88': u'Oo',
u'\xcc\x84': u'AEGIOUYaegiouy\xc3\x86\xc3\xa6\xce\x91\xce\x99\xce\xa5\xce\xb1\xce\xb9\xcf\x85\xd0\x98\xd0\xa3\xd0\xb8\xd1\x83',
u'\xcc\x84\xcc\x80': u'EOeo',
u'\xcc\x84\xcc\x81': u'EOeo',
u'\xcc\x84\xcc\x88': u'Uu',
u'\xcc\x86': u'AEGIOUaegiou\xce\x91\xce\x99\xce\xa5\xce\xb1\xce\xb9\xcf\x85\xd0\x90\xd0\x95\xd0\x96\xd0\x98\xd0\xa3\xd0\xb0\xd0\xb5\xd0\xb6\xd0\xb8\xd1\x83',
u'\xcc\x86\xcc\x80': u'Aa',
u'\xcc\x86\xcc\x81': u'Aa',
u'\xcc\x86\xcc\x83': u'Aa',
u'\xcc\x86\xcc\x89': u'Aa',
u'\xcc\x87': u'ABCDEFGHIMNOPRSTWXYZabcdefghmnoprstwxyz',
u'\xcc\x87\xcc\x84': u'AOao',
u'\xcc\x88': u'AEHIOUWXYaehiotuwxy\xce\x99\xce\xa5\xce\xb9\xcf\x85\xd0\x86\xd0\x90\xd0\x95\xd0\x96\xd0\x97\xd0\x98\xd0\x9e\xd0\xa3\xd0\xa7\xd0\xab\xd0\xad\xd0\xb0\xd0\xb5\xd0\xb6\xd0\xb7\xd0\xb8\xd0\xbe\xd1\x83\xd1\x87\xd1\x8b\xd1\x8d\xd1\x96\xd3\x98\xd3\x99\xd3\xa8\xd3\xa9',
u'\xcc\x88\xcc\x80': u'Uu\xce\xb9\xcf\x85',
u'\xcc\x88\xcc\x81': u'IUiu\xce\xb9\xcf\x85',
u'\xcc\x88\xcc\x84': u'AOUaou',
u'\xcc\x88\xcc\x8c': u'Uu',
u'\xcc\x88\xcd\x82': u'\xce\xb9\xcf\x85',
u'\xcc\x89': u'AEIOUYaeiouy',
u'\xcc\x8a': u'AUauwy',
u'\xcc\x8a\xcc\x81': u'Aa',
u'\xcc\x8b': u'OUou\xd0\xa3\xd1\x83',
u'\xcc\x8c': u'ACDEGHIKLNORSTUZacdeghijklnorstuz\xc6\xb7\xca\x92',
u'\xcc\x8c\xcc\x87': u'Ss',
u'\xcc\x8f': u'AEIORUaeioru\xd1\xb4\xd1\xb5',
u'\xcc\x91': u'AEIORUaeioru',
u'\xcc\x93': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x81\xcf\x85\xcf\x89',
u'\xcc\x93\xcc\x80': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89',
u'\xcc\x93\xcc\x80\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x93\xcc\x81': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89',
u'\xcc\x93\xcc\x81\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x93\xcd\x82': u'\xce\x91\xce\x97\xce\x99\xce\xa9\xce\xb1\xce\xb7\xce\xb9\xcf\x85\xcf\x89',
u'\xcc\x93\xcd\x82\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x93\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x94': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa1\xce\xa5\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x81\xcf\x85\xcf\x89',
u'\xcc\x94\xcc\x80': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa5\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89',
u'\xcc\x94\xcc\x80\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x94\xcc\x81': u'\xce\x91\xce\x95\xce\x97\xce\x99\xce\x9f\xce\xa5\xce\xa9\xce\xb1\xce\xb5\xce\xb7\xce\xb9\xce\xbf\xcf\x85\xcf\x89',
u'\xcc\x94\xcc\x81\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x94\xcd\x82': u'\xce\x91\xce\x97\xce\x99\xce\xa5\xce\xa9\xce\xb1\xce\xb7\xce\xb9\xcf\x85\xcf\x89',
u'\xcc\x94\xcd\x82\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x94\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89',
u'\xcc\x9b': u'OUou',
u'\xcc\x9b\xcc\x80': u'OUou',
u'\xcc\x9b\xcc\x81': u'OUou',
u'\xcc\x9b\xcc\x83': u'OUou',
u'\xcc\x9b\xcc\x89': u'OUou',
u'\xcc\x9b\xcc\xa3': u'OUou',
u'\xcc\xa3': u'ABDEHIKLMNORSTUVWYZabdehiklmnorstuvwyz',
u'\xcc\xa3\xcc\x82': u'AEOaeo',
u'\xcc\xa3\xcc\x84': u'LRlr',
u'\xcc\xa3\xcc\x86': u'Aa',
u'\xcc\xa3\xcc\x87': u'Ss',
u'\xcc\xa4': u'Uu',
u'\xcc\xa5': u'Aa',
u'\xcc\xa6': u'STst',
u'\xcc\xa7': u'CDEGHKLNRSTcdeghklnrst',
u'\xcc\xa7\xcc\x81': u'Cc',
u'\xcc\xa7\xcc\x86': u'Ee',
u'\xcc\xa8': u'AEIOUaeiou',
u'\xcc\xa8\xcc\x84': u'Oo',
u'\xcc\xad': u'DELNTUdelntu',
u'\xcc\xae': u'Hh',
u'\xcc\xb0': u'EIUeiu',
u'\xcc\xb1': u'BDKLNRTZbdhklnrtz',
u'\xcd\x82': u'\xce\xb1\xce\xb7\xce\xb9\xcf\x85\xcf\x89',
u'\xcd\x82\xcd\x85': u'\xce\xb1\xce\xb7\xcf\x89',
u'\xcd\x85': u'\xce\x91\xce\x97\xce\xa9\xce\xb1\xce\xb7\xcf\x89' }
def diacritic_for_letters(regenerate=False):
    """Return a mapping of combining diacritic marks to base letters.

    Each key is a sequence of one or more combining marks; each value is a
    sorted unicode string of the letters (mostly ASCII, but any character
    of category Lu/Ll/Lt qualifies) that combine with those marks into a
    single unicode character.

    Computing the mapping means scanning every code point, which is
    expensive, so the precomputed ``_DIACRITIC_CACHE`` is returned unless
    ``regenerate`` is true.  ``regenerate=True`` is meant for unit tests
    that validate the cache.
    """
    if not regenerate:
        return _DIACRITIC_CACHE

    # The decompiled source initialised this to None, which made the
    # setdefault() call below fail; it has to be a dict.
    found = {}
    for code_point in xrange(sys.maxunicode):
        char = unichr(code_point)
        decomposed = unicodedata.normalize('NFKD', char)
        if len(decomposed) <= 1:
            # No decomposition, so there is no diacritic to split off.
            continue
        if unicodedata.category(char) not in ('Lu', 'Ll', 'Lt'):
            continue
        marks = decomposed[1:]
        if not all(unicodedata.combining(mark) for mark in marks):
            # Something other than combining marks follows the base char.
            continue
        found.setdefault(marks, set()).add(decomposed[0])

    return dict(
        (marks, u''.join(sorted(letters)))
        for marks, letters in found.items())
def generate_re_diacritic_mapping(_diacritic_for_letters):
    """Invert a marks-to-letters mapping into letter-to-variants.

    For every base letter listed in ``_diacritic_for_letters`` the letter
    is combined with the corresponding marks (NFKC normalisation) and the
    results are collected.  Returns a dict mapping each base letter to a
    sorted unicode string of all its composed variants.
    """
    collected = {}
    for marks, base_letters in _diacritic_for_letters.iteritems():
        for base in base_letters:
            composed = unicodedata.normalize('NFKC', base + marks)
            if base not in collected:
                collected[base] = []
            collected[base].append(composed)

    for base in list(collected):
        collected[base] = u''.join(sorted(collected[base]))
    return collected
def _fixup_literal(literal, in_seq, mapping):
    """Return the regex text for one literal code point.

    ``literal`` is the code point as an integer.  If the character has an
    entry in ``mapping`` its variants are appended, and the escaped result
    is wrapped in a bracket expression unless ``in_seq`` says the caller
    is already building one.
    """
    char = unichr(literal)
    candidates = char + mapping[char] if char in mapping else char
    escaped = re_escape(candidates)
    # Brackets are only needed when there is more than one candidate and
    # the caller is not already inside a bracket expression.
    if len(candidates) > 1 and not in_seq:
        return u'[%s]' % escaped
    return escaped
def _fixup_not_literal(literal, mapping):
    """Return a negated bracket expression for one literal code point.

    ``literal`` is the code point as an integer.  The negated set holds
    the character itself plus any variants ``mapping`` lists for it.
    """
    char = unichr(literal)
    excluded = char + mapping[char] if char in mapping else char
    return u'[^%s]' % re_escape(excluded)
def _fixup_range(start, end, mapping):
    """Return the bracket-expression text for the range ``start``-``end``.

    ``start`` and ``end`` are inclusive code points.  The escaped variants
    of every character in the range that appears in ``mapping`` are placed
    before the escaped ``start-end`` range itself.
    """
    in_range = (unichr(code_point) for code_point in xrange(start, end + 1))
    variants = [re_escape(mapping[char]) for char in in_range
                if char in mapping]
    first = re_escape(unichr(start))
    last = re_escape(unichr(end))
    return u'%s%s-%s' % (''.join(variants), first, last)
def _construct_regexp(pattern, mapping):
    """Rebuild a regular expression string from a parsed pattern.

    ``pattern`` is an iterable of ``(op, av)`` pairs (the form
    ``sre_parse.parse`` yields under Python 2, where ops are strings).
    Literals, negated literals and ranges are expanded with the variants
    from ``mapping``; every other supported construct is written back
    unchanged.

    Returns the rebuilt pattern as a unicode string.

    Raises NotImplementedError for any op, category, anchor or assertion
    direction that is not handled here (e.g. group references).
    """
    # Lookup tables are loop-invariant, so build them once per call.
    categories = {
        'category_word': u'\\w',
        'category_not_word': u'\\W',
        'category_digit': u'\\d',
        'category_not_digit': u'\\D',
        'category_space': u'\\s',
        'category_not_space': u'\\S',
    }
    anchors = {
        'at_beginning': u'^',
        'at_end': u'$',
        'at_beginning_string': u'\\A',
        'at_boundary': u'\\b',
        'at_non_boundary': u'\\B',
        'at_end_string': u'\\Z',
    }

    parts = []
    # A single if/elif chain: each op is handled by exactly one branch and
    # only a genuinely unknown op reaches the final NotImplementedError.
    for op, av in pattern:
        if op == 'not_literal':
            parts.append(_fixup_not_literal(av, mapping))
        elif op == 'literal':
            parts.append(_fixup_literal(av, False, mapping))
        elif op == 'category':
            try:
                parts.append(categories[av])
            except KeyError:
                raise NotImplementedError(av)
        elif op == 'any':
            parts.append(u'.')
        elif op == 'negate':
            parts.append(u'^')
        elif op == 'in':
            in_parts = []
            for entry in av:
                # Separate names so the outer ``op`` is not clobbered.
                entry_op, entry_av = entry
                if entry_op == 'literal':
                    in_parts.append(_fixup_literal(entry_av, True, mapping))
                else:
                    in_parts.append(_construct_regexp([entry], mapping))
            parts.append(u'[%s]' % u''.join(in_parts))
        elif op == 'range':
            start, end = av
            parts.append(_fixup_range(start, end, mapping))
        elif op == 'max_repeat' or op == 'min_repeat':
            min_, max_, pad = av
            pad = _construct_regexp(pad, mapping)
            if min_ == 1 and max_ == sre_parse.MAXREPEAT:
                parts.append(u'%s+' % pad)
            elif min_ == 0 and max_ == sre_parse.MAXREPEAT:
                parts.append(u'%s*' % pad)
            elif min_ == 0 and max_ == 1:
                parts.append(u'%s?' % pad)
            else:
                parts.append(u'%s{%d,%d}' % (pad, min_, max_))
            if op == 'min_repeat':
                # Non-greedy repeats carry a trailing '?'.
                parts[-1] = parts[-1] + u'?'
        elif op == 'at':
            try:
                parts.append(anchors[av])
            except KeyError:
                raise NotImplementedError(av)
        elif op == 'subpattern':
            group, pad = av
            pad = _construct_regexp(pad, mapping)
            if group is None:
                parts.append(u'(?:%s)' % pad)
            else:
                parts.append(u'(%s)' % pad)
        elif op == 'assert':
            direction, pad = av
            pad = _construct_regexp(pad, mapping)
            if direction == 1:
                parts.append(u'(?=%s)' % pad)
            elif direction == -1:
                parts.append(u'(?<=%s)' % pad)
            else:
                raise NotImplementedError(direction)
        elif op == 'assert_not':
            direction, pad = av
            pad = _construct_regexp(pad, mapping)
            if direction == 1:
                parts.append(u'(?!%s)' % pad)
            elif direction == -1:
                parts.append(u'(?<!%s)' % pad)
            else:
                raise NotImplementedError(direction)
        elif op == 'branch':
            dummy, branches = av
            alternatives = [_construct_regexp(b, mapping) for b in branches]
            parts.append(u'%s' % u'|'.join(alternatives))
        else:
            raise NotImplementedError(op)

    return u''.join(parts)
def re_replace_literals(text, mapping):
    """Return ``text`` with its literals expanded through ``mapping``.

    ``text`` is a unicode regular expression.  It is parsed with
    ``sre_parse`` and rebuilt by ``_construct_regexp``, which swaps each
    literal that has an entry in ``mapping`` for a bracket expression of
    the literal and its mapped variants.

    Raises AssertionError if ``text`` is not unicode, re.error if it is not
    a valid regular expression, and NotImplementedError if it uses a
    construct that cannot be rebuilt.
    """
    if not isinstance(text, unicode):
        raise AssertionError
    # The decompiler lost the module name here and emitted None.parse().
    pattern = sre_parse.parse(text)
    return _construct_regexp(pattern, mapping)
# Letter -> accented-variants table, built once at import time from the
# cached diacritic data (no regeneration).
_diacritic_mapping = generate_re_diacritic_mapping(
    diacritic_for_letters(regenerate=False))
def re_add_diacritic_variants(text):
    """Make the plain letters in a regex also match accented variants.

    Every literal character of ``text`` that has an entry in the
    module-level diacritic mapping is replaced by a bracket expression
    holding the character itself and all its variants with diacritic
    marks.  For example u"Mum" becomes u"[M...][u...][m...]", where each
    "..." stands for the accented forms of that letter.  Characters
    without an entry are left as they are.

    Supports all regular expressions except ones with group references.

    Raises AssertionError if ``text`` is not unicode, re.error if the
    regex is invalid, and NotImplementedError if it uses an unsupported
    construct.
    """
    if not isinstance(text, unicode):
        raise AssertionError
    # The decompiler lost the callee name here and emitted None(...).
    return re_replace_literals(text, _diacritic_mapping)